Anomaly Detection and Imbalanced Classification of patient clinical features to predict if they will have a stroke
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score, average_precision_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
path = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(path)
df.head()
df.shape
Let's check if the dataset is balanced
df.stroke.value_counts()
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="stroke", data=df)
Looks like the dataset is highly imbalanced in favor of non-stroke patients (> 20:1). Hence, the dataset will need to be balanced before building the model.
Let's check if the gender split is balanced
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="gender", data=df)
df.gender.value_counts()
The male/female split is not too imbalanced but we should get rid of the 'Other' outlier.
df.drop(df[df.gender == "Other"].index, inplace = True)
df.gender.value_counts()
df.shape
sns.set_theme(style="darkgrid")
sns.displot(df, x="age")
Patient age is distributed without any outliers.
df.age.describe()
Let's explore the remaining clinical features of the patients.
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="hypertension", data=df)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="heart_disease", data=df)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="ever_married", data=df)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="work_type", data=df)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="Residence_type", data=df)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="smoking_status", data=df)
sns.set_theme(style="darkgrid")
sns.displot(df, x="avg_glucose_level")
df.avg_glucose_level.describe()
Patient average glucose level is right skewed where the majority of patients have a glucose level on the lower end of the spectrum, hence majority of patients have a healthy glucose level. No visible outliers exist in this clinical feature
sns.set_theme(style="darkgrid")
sns.displot(df, x="bmi")
df.bmi.describe()
Patient BMI is normally distributed with some apparent outliers, which may or may not affect the model performance.
Let's check the stroke/no-stroke split once again before data cleaning
df.stroke.value_counts()
Let's clean the data by checking for missing values
df.isna().sum()
Drop missing values
df.dropna(inplace=True)
df.isna().sum()
We dropped all missing values
df.shape
Let's check the stroke/no-stroke split after data cleaning
df.stroke.value_counts()
First step is to drop the id column since it doesn't add any information to the dataset.
df.drop('id', axis=1, inplace = True)
df.shape
Let's build a baseline model with minimal feature engineering.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
Let's encode the categorical variables
df.dtypes
Need to encode the object data types only.
from sklearn.compose import ColumnTransformer
# One-hot encode the object-dtype columns; indices [0,4,5,6,9] presumably map to
# gender, ever_married, work_type, Residence_type, smoking_status — TODO confirm
# against df.dtypes above. remainder='passthrough' keeps the numeric columns as-is.
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0,4,5,6,9])], remainder = 'passthrough')
# fit_transform may return a sparse matrix; np.array forces a dense ndarray.
X = np.array(ct.fit_transform(X))
print(X)
X.shape
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape
X_test.shape
Let's perform feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train
X_test
Let's build a pipeline of baseline classifiers with default parameters
def model_pipeline(X_tr, X_ts, y_tr, y_ts):
    """Fit a suite of baseline classifiers (default hyperparameters) and report results.

    For each classifier this prints a classification report and shows three plots:
    confusion matrix, ROC curve, and precision-recall curve.

    Parameters
    ----------
    X_tr, X_ts : array-like
        Training and test feature matrices (already encoded and scaled).
    y_tr, y_ts : array-like
        Training and test labels (0 = non-stroke, 1 = stroke).

    Returns
    -------
    None
    """
    # plot_confusion_matrix / plot_roc_curve / plot_precision_recall_curve were
    # removed in scikit-learn 1.2; the Display.from_estimator API replaces them.
    # NOTE(review): the corresponding top-of-file imports of the removed helpers
    # should also be deleted.
    from sklearn.metrics import (
        ConfusionMatrixDisplay,
        PrecisionRecallDisplay,
        RocCurveDisplay,
    )

    classifiers = [
        {'label': 'SVM', 'model': SVC()},
        {'label': 'KNN', 'model': KNeighborsClassifier()},
        {'label': 'DT', 'model': DecisionTreeClassifier()},
        {'label': 'RF', 'model': RandomForestClassifier()},
        {'label': 'Logistic Regression', 'model': LogisticRegression()},
        {'label': 'GNB', 'model': GaussianNB()},
        {'label': 'MLP', 'model': MLPClassifier()},
    ]
    for clf in classifiers:
        model = clf['model']
        model.fit(X_tr, y_tr)
        preds = model.predict(X_ts)
        print(clf['label'] + ' Classification Report')
        print(classification_report(y_ts, preds))

        ConfusionMatrixDisplay.from_estimator(
            model, X_ts, y_ts, display_labels=['Non-Stroke', 'Stroke'])
        plt.grid(False)
        plt.title('Confusion Matrix', fontweight='bold', fontsize=15)
        plt.show()

        RocCurveDisplay.from_estimator(model, X_ts, y_ts)
        plt.title('ROC Curve', fontweight='bold', fontsize=15)
        plt.show()

        PrecisionRecallDisplay.from_estimator(model, X_ts, y_ts)
        plt.title('Precision-Recall Curve', fontweight='bold', fontsize=15)
        plt.show()
model_pipeline(X_train, X_test, y_train, y_test)
from collections import Counter
counter = Counter(y_test)
print(counter)
Since the dataset is imbalanced, accuracy is not a useful metric. Instead we will look at precision and recall to evaluate our classifier. For this particular application, the most important metric is recall since it is essential to correctly predict the patients who will get a stroke so that they can be put on medication in advance and avoid getting the stroke (preventive step). Precision is of lesser importance, but still significant, as it will prevent doctors from wrongly prescribing stroke prevention medications to patients who are not likely to have a stroke. So, ideally we want to maximize recall while maintaining a reasonable precision.
Only GNB gave a good recall score (96%) but with a low precision (7%). This is a very good recall score but we would like to significantly increase the precision.
Let's do some more feature engineering to improve precision without compromising recall.
Let's start with balancing the dataset as it is highly imbalanced. We will use SMOTE for this.
from imblearn.over_sampling import SMOTE
# Oversample the minority (stroke) class with synthetic samples.
# Correctly applied to the training split only, so the test set stays untouched.
oversample = SMOTE()
X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)
# Compare class counts before and after resampling.
counter1 = Counter(y_train)
counter2 = Counter(y_resampled)
print(counter1)
print(counter2)
X_resampled.shape
y_resampled.shape
X_test.shape
y_test.shape
We have upsampled the minority class in the training set by generating synthetic samples. Let's run the classifiers again on the new training data.
model_pipeline(X_resampled, X_test, y_resampled, y_test)
Precision went up very slightly for GNB as now we have 903 FPs compared to 920 FPs (before SMOTE) and recall stayed the same. We will keep the resampled training data and we will try a different strategy to improve precision significantly.
from sklearn.decomposition import PCA
# PCA with a float n_components keeps the smallest number of components
# that explain at least 99% of the variance.
pca = PCA(.99)
# Fit on the resampled training data only; the test set is transformed
# with the same fitted projection (no leakage).
pca.fit(X_resampled)
train_pca = pca.transform(X_resampled)
test_pca = pca.transform(X_test)
print("Shape of resampled training data:", X_resampled.shape)
print("Shape of transformed training data :", train_pca.shape)
print("Shape of transformed testing data :", test_pca.shape)
Let's rerun the classifiers after dimensionality reduction.
model_pipeline(train_pca, test_pca, y_resampled, y_test)
Dimensionality reduction doubled the precision but significantly reduced the recall (from 96% to 83%). So, it makes the classifier perform worse and will not be used in model building.
Let's try Outlier Detection. We will start with Isolation Forest.
from sklearn.ensemble import IsolationForest
forr = IsolationForest(contamination=0.01)
forr_outliers = forr.fit_predict(X_resampled)
print(Counter(forr_outliers))
We have identified observations in the resampled training set that are outliers. Let's remove them.
X_removed = X_resampled[np.where(forr_outliers == 1, True, False)]
X_removed.shape
y_removed = y_resampled[np.where(forr_outliers == 1, True, False)]
y_removed.shape
Counter(y_removed)
Now, let's rerun the models and evaluate their performance after having removed outliers.
model_pipeline(X_removed, X_test, y_removed, y_test)
The best classifiers (GNB and Logistic Regression) were unaffected after removing outliers using Isolation Forest.
Let's try another outlier detection method, DBSCAN.
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps = 0.3, min_samples = 5)
db_outliers = dbscan.fit_predict(X_resampled)
print(Counter(db_outliers))
X_removed = X_resampled[np.where(db_outliers != -1, True, False)]
X_removed.shape
y_removed = y_resampled[np.where(db_outliers != -1, True, False)]
y_removed.shape
Counter(y_removed)
model_pipeline(X_removed, X_test, y_removed, y_test)
Removing outliers using DBSCAN significantly improves the stroke class recall for all algorithms, which is a good sign. Precision of the stroke class however is still very low and should be improved. SVM gave perfect recall but precision is still low.
DBSCAN seems to improve the recall significantly for all algorithms. Let's see if other outlier removal methods do the same by creating an outlier removal pipeline.
def outlier_pipeline(X, y, method, eps=0.3, min_samples=5, contam_iso=0.01,
                     n_neighbors=2, contam_lof=0.01, contam_cov=0.1):
    """Remove outliers from (X, y) using the chosen anomaly-detection method.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and labels (typically the SMOTE-resampled training set).
    method : str
        One of 'DBSCAN', 'Isolation Forest', 'LOF', 'Robust Covariance',
        'One-Class SVM'.
    eps, min_samples : DBSCAN parameters.
    contam_iso, contam_lof, contam_cov : contamination fractions for
        Isolation Forest, LOF and Robust Covariance respectively.
    n_neighbors : neighborhood size for LOF.

    Returns
    -------
    (X_rem, y_rem) : the inlier rows of X and the matching labels.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names (the original code
        silently fell through to a NameError instead).
    """
    # scikit-learn estimator parameters are keyword-only since 1.0, so
    # every constructor below passes them by name.
    if method == 'DBSCAN':
        detector = DBSCAN(eps=eps, min_samples=min_samples)
        print(detector.get_params())
        print()
        outliers = detector.fit_predict(X)
        keep = outliers != -1  # DBSCAN marks noise points with label -1
    elif method == 'Isolation Forest':
        detector = IsolationForest(contamination=contam_iso)
        print(detector.get_params())
        print()
        outliers = detector.fit_predict(X)
        keep = outliers == 1  # fit_predict: 1 = inlier, -1 = outlier
    elif method == 'LOF':
        detector = LocalOutlierFactor(n_neighbors=n_neighbors,
                                      contamination=contam_lof)
        print(detector.get_params())
        print()
        outliers = detector.fit_predict(X)
        keep = outliers == 1
    elif method == 'Robust Covariance':
        detector = EllipticEnvelope(contamination=contam_cov)
        print(detector.get_params())
        print()
        outliers = detector.fit_predict(X)
        keep = outliers == 1
    elif method == 'One-Class SVM':
        detector = OneClassSVM()
        print(detector.get_params())
        print()
        outliers = detector.fit_predict(X)
        keep = outliers == 1
    else:
        raise ValueError(f'Unknown outlier detection method: {method!r}')

    X_rem = X[keep]
    y_rem = y[keep]
    print('Counts of each label after anomaly detection algorithm is applied:',
          Counter(outliers))
    print('Shape of training set before outlier removal:', X.shape)
    print('Shape of training set after outlier removal:', X_rem.shape)
    print('Number of labels after outlier removal:', len(y_rem))
    print('Counts of the 2 classes before outlier removal:', Counter(y))
    print('Counts of the 2 classes after outlier removal:', Counter(y_rem))
    print()
    return X_rem, y_rem
X_removed_db, y_removed_db = outlier_pipeline(X_resampled,y_resampled,'DBSCAN')
X_removed_iso, y_removed_iso = outlier_pipeline(X_resampled,y_resampled,'Isolation Forest')
X_removed_lof, y_removed_lof = outlier_pipeline(X_resampled,y_resampled,'LOF')
X_removed_cov, y_removed_cov = outlier_pipeline(X_resampled,y_resampled,'Robust Covariance')
X_removed_svm, y_removed_svm = outlier_pipeline(X_resampled,y_resampled,'One-Class SVM')
def summary_pipeline(X_tr, X_ts, y_tr, y_ts):
    """Fit the baseline classifiers and tabulate test-set metrics.

    Parameters
    ----------
    X_tr, X_ts : array-like
        Training and test feature matrices.
    y_tr, y_ts : array-like
        Training and test labels (0 = non-stroke, 1 = stroke).

    Returns
    -------
    pandas.DataFrame
        One row per classifier (indexed by class name) with recall,
        precision, f1-score, AUC, average precision, and tnr
        (specificity / true-negative rate).
    """
    classifiers = [SVC(),
                   KNeighborsClassifier(),
                   DecisionTreeClassifier(),
                   RandomForestClassifier(),
                   LogisticRegression(),
                   GaussianNB(),
                   MLPClassifier()]

    # DataFrame.append was removed in pandas 2.0 (and was quadratic);
    # collect rows in a list and build the frame once instead. The
    # original also omitted 'tnr' from the declared columns.
    rows = []
    for clf in classifiers:
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_ts)
        rows.append({
            'classifiers': clf.__class__.__name__,
            'recall': recall_score(y_ts, preds),
            'precision': precision_score(y_ts, preds),
            'f1-score': f1_score(y_ts, preds),
            # NOTE: AUC/AP computed from hard predictions, as in the
            # original; probability scores would give smoother estimates.
            'auc': roc_auc_score(y_ts, preds),
            'ap': average_precision_score(y_ts, preds),
            # Specificity: recall of the negative (non-stroke) class.
            'tnr': recall_score(y_ts, preds, pos_label=0),
        })

    result_table = pd.DataFrame(
        rows,
        columns=['classifiers', 'recall', 'precision', 'f1-score',
                 'auc', 'ap', 'tnr'])
    result_table.set_index('classifiers', inplace=True)
    return result_table
display(model_pipeline(X_removed_db, X_test, y_removed_db, y_test))
display(summary_pipeline(X_removed_db, X_test, y_removed_db, y_test))
display(model_pipeline(X_removed_iso, X_test, y_removed_iso, y_test))
display(summary_pipeline(X_removed_iso, X_test, y_removed_iso, y_test))
display(model_pipeline(X_removed_lof, X_test, y_removed_lof, y_test))
display(summary_pipeline(X_removed_lof, X_test, y_removed_lof, y_test))
display(model_pipeline(X_removed_cov, X_test, y_removed_cov, y_test))
display(summary_pipeline(X_removed_cov, X_test, y_removed_cov, y_test))
display(model_pipeline(X_removed_svm, X_test, y_removed_svm, y_test))
display(summary_pipeline(X_removed_svm, X_test, y_removed_svm, y_test))
An alternative metric to precision that we can use is specificity (or True Negative Rate), which tells us how well the model is able to detect the negative class (i.e. non-stroke patients). Attempting to increase specificity, without compromising sensitivity (i.e. recall), will decrease the false positives and maintain the low false negatives that is required for this application.
We can also try alternative ways of handling missing data and balancing the data (undersampling, etc.)
path = 'healthcare-dataset-stroke-data.csv'
df_new = pd.read_csv(path)
df_new.head()
df_new.shape
df_new.stroke.value_counts()
df_new.isna().sum()
Let's try a different approach of handling the missing data so that we don't lose any information.
df_new.bmi.describe()
sns.displot(df_new, x="bmi")
BMI is approximately normally distributed with a small standard deviation. So, we will impute the missing values with the mean.
from sklearn.impute import SimpleImputer
# Replace missing BMI values with the column mean instead of dropping rows.
# NOTE(review): the imputer is fit on the full dataset before the train/test
# split below, which leaks test-set statistics into training — consider
# fitting it on the training split only.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a 2-D array; ravel() flattens it back to a Series-shaped 1-D array.
df_new.bmi = imp.fit_transform(df_new[['bmi']]).ravel()
df_new.isna().sum()
df_new.shape
df_new.stroke.value_counts()
df_new.gender.value_counts()
# Drop the single 'Other' gender row, mirroring the earlier cleaning pass.
df_new.drop(df_new[df_new.gender == "Other"].index, inplace = True)
df_new.gender.value_counts()
No information was lost in the cleaning process.
def data_preprocessing(data, test_size=0.3, random_state=42):
    """Encode, split, and standardize the stroke dataset.

    Drops the uninformative 'id' column, one-hot encodes the categorical
    columns (indices 0, 4, 5, 6, 9 of the feature matrix), performs a
    train/test split, and standardizes features using statistics fitted
    on the training split only.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataframe with 'id' as the first column and 'stroke' as the last.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the reproducible split.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : scaled feature arrays and labels.
    """
    features = data.drop('id', axis=1)
    # Last column is the target; everything before it is a feature.
    X = features.iloc[:, :-1].values
    y = features.iloc[:, -1].values

    encoder = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), [0, 4, 5, 6, 9])],
        remainder='passthrough')
    X = np.array(encoder.fit_transform(X))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Fit the scaler on training data only, then apply to both splits.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = data_preprocessing(df_new)
display(model_pipeline(X_train, X_test, y_train, y_test))
display(summary_pipeline(X_train, X_test, y_train, y_test))
oversample = SMOTE()
X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)
display(Counter(y_train))
display(Counter(y_resampled))
X_train.shape
X_resampled.shape
display(model_pipeline(X_resampled, X_test, y_resampled, y_test))
display(summary_pipeline(X_resampled, X_test, y_resampled, y_test))
X_removed_db, y_removed_db = outlier_pipeline(X_resampled,y_resampled,'DBSCAN')
X_removed_iso, y_removed_iso = outlier_pipeline(X_resampled,y_resampled,'Isolation Forest')
X_removed_lof, y_removed_lof = outlier_pipeline(X_resampled,y_resampled,'LOF')
X_removed_cov, y_removed_cov = outlier_pipeline(X_resampled,y_resampled,'Robust Covariance')
X_removed_svm, y_removed_svm = outlier_pipeline(X_resampled,y_resampled,'One-Class SVM')
display(model_pipeline(X_removed_db, X_test, y_removed_db, y_test))
display(summary_pipeline(X_removed_db, X_test, y_removed_db, y_test))
display(model_pipeline(X_removed_iso, X_test, y_removed_iso, y_test))
display(summary_pipeline(X_removed_iso, X_test, y_removed_iso, y_test))
display(model_pipeline(X_removed_lof, X_test, y_removed_lof, y_test))
display(summary_pipeline(X_removed_lof, X_test, y_removed_lof, y_test))
display(model_pipeline(X_removed_cov, X_test, y_removed_cov, y_test))
display(summary_pipeline(X_removed_cov, X_test, y_removed_cov, y_test))
display(model_pipeline(X_removed_svm, X_test, y_removed_svm, y_test))
display(summary_pipeline(X_removed_svm, X_test, y_removed_svm, y_test))
Replacing the missing data with the mean instead of removing it reduces the predictive capability of the models. So, we will go back to the original method of handling the missing data.